RCV1-v2 dataset: the source used in this notebook is scikit-learn's `fetch_rcv1` loader (see the first code cell).


In [1]:
import logging

from sklearn.datasets import fetch_rcv1
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.pipeline import Pipeline
from sklearn import svm

# Configure the root logger with its default handler before anything else runs,
# so log records emitted during the fetch below have somewhere to go.
logging.basicConfig()
# Load the RCV1 dataset bundle; its `.data` and `.target` attributes are sliced
# into train/test parts in the next cell.
# NOTE(review): presumably this downloads and caches the data on first use — confirm.
rcv1 = fetch_rcv1()

In [2]:
# Number of leading rows used for training; every remaining row is held out for testing.
# NOTE(review): 23149 looks like the size of the official LYRL2004 training split
# described in the scikit-learn `fetch_rcv1` docs — confirm.
training_samples = 23149

# Features and labels are cut at the same row index so samples stay aligned.
X_train, X_test = rcv1.data[:training_samples], rcv1.data[training_samples:]
y_train, y_test = rcv1.target[:training_samples], rcv1.target[training_samples:]

In [3]:
# One-vs-rest wrapper around a linear SVM base estimator.
# NOTE(review): according to the scikit-learn LinearSVC documentation, `penalty`,
# `loss` and `dual` are ignored when multi_class='crammer_singer', so the L1
# penalty requested here may not actually be applied — confirm which of the two
# settings was intended before changing either (the recorded score depends on it).
clf = OneVsRestClassifier(
    estimator=svm.LinearSVC(
        penalty='l1',
        tol=0.01,
        multi_class='crammer_singer',
        dual=False,
    )
)

In [4]:
# Train on the training split. The recorded run emitted UserWarnings for labels
# 49 and 80 ("Label not N is present in all training examples"), i.e. presumably
# those labels never occur in the training rows — verify before relying on
# per-label results for them.
clf.fit(X_train,y_train)


/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 49 is present in all training examples.
  str(classes[c]))
/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 80 is present in all training examples.
  str(classes[c]))
Out[4]:
OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='crammer_singer', penalty='l1', random_state=None,
     tol=0.01, verbose=0),
          n_jobs=1)

In [5]:
# Predict labels for every held-out row.
y_pred = clf.predict(X_test)

# Score the predictions against the true test labels with micro-averaged F1.
current_score = f1_score(y_true=y_test, y_pred=y_pred, average='micro')

In [6]:
# Display the micro-averaged F1 score computed in the previous cell.
current_score


Out[6]:
0.80843419139591599